#libraries needed
library(vegan)
library(cluster)
library(pvclust)
#datasets
envdata<-read.csv('env.csv',header=TRUE,row.names=1)
speabu<-read.csv('abund.csv',header=TRUE,row.names=1)
##Conduct non-hierarchical divisive clustering
#K-means (uses the actual data (euclidean distance only)
#standardize first
envdata.tra<-data.stand(envdata,method='standardize',margin='column',plot=F)
#now euclidean distance matrix
site.euc<-vegdist(envdata.tra,method='euclidean')
#determine the number of clusters using a scree plot with both dissimilarity and also silhouette width
nhclus.scree(site.euc,max.k=44)
#use K-means clustering specifying 8 clusters (determined from scree plot), and a high number of iterations (10000)
sitecl.kmeans<-kmeans(envdata.tra,centers=8,iter.max=10000,nstart=25)
#look at results
sitecl.kmeans
#extract specific results by referencing available components, for this example we're extracting site classifications
sitecl.kmeans$cluster
#can also create new object (table), with just the classification data
sitecl.class<-sitecl.kmeans$cluster
#if you want to see what types of data are extractable type names function
names(sitecl.kmeans)
#evaluating number of clusters using randomization
#write a script to do a randomization test
#below is the max # of clusters to consider,#of randomizations performed,#of descriptors in dataset
no.clus<-10
no.ran<-999
no.var<-10
#creates empty matrix to store within SSE (sum of squared error)
sse<-matrix(,nrow=no.ran+1,ncol=no.clus-1)
#also creates and empty matrix to store the randomized dataset
randata<-matrix(,nrow=nrow(envdata.tra),ncol=ncol(envdata.tra))
#for loop that repeats all commands between {}, a total of j times (where j is the number of clusters to consider starting with 2)
for ( j in 2:no.clus){
sitecl.kmeans<-kmeans(envdata.tra,centers=j, iter.max=1000, nstart=5)
sse[1,j-1]<-sum(sitecl.kmeans$withinss)
for (k in 1:no.ran) {
	for (c in 1:no.var) {
		randata[,c]<-envdata.tra[,c][sample(nrow(envdata.tra))]
	}
	ran.kmeans<-kmeans(randata,centers=j,iter.max=1000,nstart=5)
	sse[k+1,j-1]<-sum(ran.kmeans$withinss)
	}
}
diff<-colMeans(sse)-sse[1,]
plot(seq(2,no.clus,by=1),diff,xlab="No. clusters",ylab='mean(ranSSE)-actualSSE')